{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f6897f36",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from collections import Counter\n",
    "from  collections import defaultdict\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e1ca528e",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('./datasets/train.csv')\n",
    "test_df = pd.read_csv('./datasets/test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e8f74922",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>label</th>\n",
       "      <th>label_desc</th>\n",
       "      <th>sentence</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>108</td>\n",
       "      <td>news_edu</td>\n",
       "      <td>上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>104</td>\n",
       "      <td>news_finance</td>\n",
       "      <td>商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>106</td>\n",
       "      <td>news_house</td>\n",
       "      <td>通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>112</td>\n",
       "      <td>news_travel</td>\n",
       "      <td>2018年去俄罗斯看世界杯得花多少钱？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>109</td>\n",
       "      <td>news_tech</td>\n",
       "      <td>剃须刀的个性革新，雷明登天猫定制版新品首发</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  label    label_desc                                        sentence\n",
       "0   0    108      news_edu    上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？\n",
       "1   1    104  news_finance  商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告\n",
       "2   2    106    news_house                通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？\n",
       "3   3    112   news_travel                             2018年去俄罗斯看世界杯得花多少钱？\n",
       "4   4    109     news_tech                           剃须刀的个性革新，雷明登天猫定制版新品首发"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "daeb275b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "上联：泰山黄山赵本山，如何对下联？                               78\n",
       "上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？    61\n",
       "马云又出惊人言论，8年后房子最不值钱，他的话可信吗？                      50\n",
       "以色列警告称如果战机被击落将会轰炸俄军事基地，你怎么看？                    48\n",
       "如果美国打伊朗，最快可以多久解决战斗？                             46\n",
       "                                                ..\n",
       "2018朋友圈母亲节祝福语大全 2018母亲节说说/朋友圈发什么                 1\n",
       "男友和我分手时发给我一条短信，三年后我才看懂意思                         1\n",
       "京东长虹·美菱联手发布战略新品 瞄准智能家电市场                         1\n",
       "新加坡为什么要判定特斯拉不环保？                                 1\n",
       "极限竞速地平线3，原创娱乐游戏视频                                1\n",
       "Name: sentence, Length: 49726, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.sentence.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b085155c",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['sentence_len'] = train_df['sentence'].apply(len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bd72247c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "482f60b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cut(sentence): return list(jieba.cut(sentence))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bb3b78a3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\ldq\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 1.271 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "train_df['words'] = train_df.sentence.apply(lambda s: ' '.join(cut(s)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "75e311ce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>label</th>\n",
       "      <th>label_desc</th>\n",
       "      <th>sentence</th>\n",
       "      <th>sentence_len</th>\n",
       "      <th>words</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>108</td>\n",
       "      <td>news_edu</td>\n",
       "      <td>上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？</td>\n",
       "      <td>44</td>\n",
       "      <td>上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>104</td>\n",
       "      <td>news_finance</td>\n",
       "      <td>商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告</td>\n",
       "      <td>46</td>\n",
       "      <td>商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>106</td>\n",
       "      <td>news_house</td>\n",
       "      <td>通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？</td>\n",
       "      <td>32</td>\n",
       "      <td>通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>112</td>\n",
       "      <td>news_travel</td>\n",
       "      <td>2018年去俄罗斯看世界杯得花多少钱？</td>\n",
       "      <td>19</td>\n",
       "      <td>2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>109</td>\n",
       "      <td>news_tech</td>\n",
       "      <td>剃须刀的个性革新，雷明登天猫定制版新品首发</td>\n",
       "      <td>21</td>\n",
       "      <td>剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  label    label_desc                                        sentence  \\\n",
       "0   0    108      news_edu    上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？   \n",
       "1   1    104  news_finance  商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告   \n",
       "2   2    106    news_house                通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？   \n",
       "3   3    112   news_travel                             2018年去俄罗斯看世界杯得花多少钱？   \n",
       "4   4    109     news_tech                           剃须刀的个性革新，雷明登天猫定制版新品首发   \n",
       "\n",
       "   sentence_len                                              words  \n",
       "0            44  上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...  \n",
       "1            46  商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...  \n",
       "2            32  通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...  \n",
       "3            19                       2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？  \n",
       "4            21                    剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e8eb21a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_df.label.value_counts()\n",
    "train_df['words_len'] = train_df['words'].apply(lambda s: len(s.split()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "2cf3dd0d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>label</th>\n",
       "      <th>label_desc</th>\n",
       "      <th>sentence</th>\n",
       "      <th>sentence_len</th>\n",
       "      <th>words</th>\n",
       "      <th>words_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>108</td>\n",
       "      <td>news_edu</td>\n",
       "      <td>上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？</td>\n",
       "      <td>44</td>\n",
       "      <td>上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...</td>\n",
       "      <td>26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>104</td>\n",
       "      <td>news_finance</td>\n",
       "      <td>商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告</td>\n",
       "      <td>46</td>\n",
       "      <td>商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>106</td>\n",
       "      <td>news_house</td>\n",
       "      <td>通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？</td>\n",
       "      <td>32</td>\n",
       "      <td>通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>112</td>\n",
       "      <td>news_travel</td>\n",
       "      <td>2018年去俄罗斯看世界杯得花多少钱？</td>\n",
       "      <td>19</td>\n",
       "      <td>2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>109</td>\n",
       "      <td>news_tech</td>\n",
       "      <td>剃须刀的个性革新，雷明登天猫定制版新品首发</td>\n",
       "      <td>21</td>\n",
       "      <td>剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  label    label_desc                                        sentence  \\\n",
       "0   0    108      news_edu    上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？   \n",
       "1   1    104  news_finance  商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告   \n",
       "2   2    106    news_house                通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？   \n",
       "3   3    112   news_travel                             2018年去俄罗斯看世界杯得花多少钱？   \n",
       "4   4    109     news_tech                           剃须刀的个性革新，雷明登天猫定制版新品首发   \n",
       "\n",
       "   sentence_len                                              words  words_len  \n",
       "0            44  上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...         26  \n",
       "1            46  商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...         20  \n",
       "2            32  通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...         21  \n",
       "3            19                       2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？         10  \n",
       "4            21                    剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发         11  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0fc7a720",
   "metadata": {},
   "outputs": [],
   "source": [
    "#train_df.words_len.value_counts()\n",
    "\n",
    "# np.std(train_df.words_len)\n",
    "\n",
    "train_df['words_keep'] = train_df['words'].apply(lambda s: ' '.join(s.split()[:20]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d8f1b07f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>label</th>\n",
       "      <th>label_desc</th>\n",
       "      <th>sentence</th>\n",
       "      <th>sentence_len</th>\n",
       "      <th>words</th>\n",
       "      <th>words_len</th>\n",
       "      <th>words_keep</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>108</td>\n",
       "      <td>news_edu</td>\n",
       "      <td>上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？</td>\n",
       "      <td>44</td>\n",
       "      <td>上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...</td>\n",
       "      <td>26</td>\n",
       "      <td>上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>104</td>\n",
       "      <td>news_finance</td>\n",
       "      <td>商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告</td>\n",
       "      <td>46</td>\n",
       "      <td>商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...</td>\n",
       "      <td>20</td>\n",
       "      <td>商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>106</td>\n",
       "      <td>news_house</td>\n",
       "      <td>通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？</td>\n",
       "      <td>32</td>\n",
       "      <td>通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...</td>\n",
       "      <td>21</td>\n",
       "      <td>通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>112</td>\n",
       "      <td>news_travel</td>\n",
       "      <td>2018年去俄罗斯看世界杯得花多少钱？</td>\n",
       "      <td>19</td>\n",
       "      <td>2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？</td>\n",
       "      <td>10</td>\n",
       "      <td>2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>109</td>\n",
       "      <td>news_tech</td>\n",
       "      <td>剃须刀的个性革新，雷明登天猫定制版新品首发</td>\n",
       "      <td>21</td>\n",
       "      <td>剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发</td>\n",
       "      <td>11</td>\n",
       "      <td>剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  label    label_desc                                        sentence  \\\n",
       "0   0    108      news_edu    上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？   \n",
       "1   1    104  news_finance  商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告   \n",
       "2   2    106    news_house                通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？   \n",
       "3   3    112   news_travel                             2018年去俄罗斯看世界杯得花多少钱？   \n",
       "4   4    109     news_tech                           剃须刀的个性革新，雷明登天猫定制版新品首发   \n",
       "\n",
       "   sentence_len                                              words  words_len  \\\n",
       "0            44  上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...         26   \n",
       "1            46  商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...         20   \n",
       "2            32  通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...         21   \n",
       "3            19                       2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？         10   \n",
       "4            21                    剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发         11   \n",
       "\n",
       "                                          words_keep  \n",
       "0  上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...  \n",
       "1  商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...  \n",
       "2  通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...  \n",
       "3                       2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？  \n",
       "4                    剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2634e9b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = train_df['words_keep'].tolist()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d5444b68",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = list(map(lambda s: s.split(), sentences))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b1a47652",
   "metadata": {},
   "outputs": [],
   "source": [
    "from functools import reduce"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b9c79790",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_words = reduce(lambda a, b: a + b, sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "f1c75794",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['上课时',\n",
       " '学生',\n",
       " '手机',\n",
       " '响个',\n",
       " '不停',\n",
       " '，',\n",
       " '老师',\n",
       " '一怒之下',\n",
       " '把',\n",
       " '手机',\n",
       " '摔',\n",
       " '了',\n",
       " '，',\n",
       " '家长',\n",
       " '拿',\n",
       " '发票',\n",
       " '让',\n",
       " '老师',\n",
       " '赔',\n",
       " '，',\n",
       " '商赢',\n",
       " '环球',\n",
       " '股份',\n",
       " '有限公司',\n",
       " '关于',\n",
       " '延期',\n",
       " '回复',\n",
       " '上海证券交易所',\n",
       " '对',\n",
       " '公司',\n",
       " '2017',\n",
       " '年',\n",
       " '年度报告',\n",
       " '的',\n",
       " '事后',\n",
       " '审核',\n",
       " '问询',\n",
       " '函',\n",
       " '的',\n",
       " '公告',\n",
       " '通过',\n",
       " '中介',\n",
       " '公司',\n",
       " '买',\n",
       " '了',\n",
       " '二手房',\n",
       " '，',\n",
       " '首付',\n",
       " '都',\n",
       " '付',\n",
       " '了',\n",
       " '，',\n",
       " '现在',\n",
       " '卖家',\n",
       " '不想',\n",
       " '卖',\n",
       " '了',\n",
       " '。',\n",
       " '怎么',\n",
       " '处理',\n",
       " '2018',\n",
       " '年',\n",
       " '去',\n",
       " '俄罗斯',\n",
       " '看',\n",
       " '世界杯',\n",
       " '得花',\n",
       " '多少',\n",
       " '钱',\n",
       " '？',\n",
       " '剃须刀',\n",
       " '的',\n",
       " '个性',\n",
       " '革新',\n",
       " '，',\n",
       " '雷明登',\n",
       " '天猫',\n",
       " '定制',\n",
       " '版',\n",
       " '新品',\n",
       " '首发',\n",
       " '再次',\n",
       " '证明',\n",
       " '了',\n",
       " '“',\n",
       " '无敌',\n",
       " '是',\n",
       " '多么',\n",
       " '寂寞',\n",
       " '”',\n",
       " '—',\n",
       " '—',\n",
       " '逆天',\n",
       " '的',\n",
       " '中国乒乓球队',\n",
       " '！',\n",
       " '三农',\n",
       " '盾',\n",
       " 'SACC',\n",
       " '-',\n",
       " '全球',\n",
       " '首个',\n",
       " '推出',\n",
       " '：',\n",
       " '互联网',\n",
       " '+',\n",
       " '区块',\n",
       " '链',\n",
       " '+',\n",
       " '农产品',\n",
       " '的',\n",
       " '电商',\n",
       " '平台',\n",
       " '重做',\n",
       " 'or',\n",
       " '新',\n",
       " '英雄',\n",
       " '？',\n",
       " '其实',\n",
       " '重做',\n",
       " '对',\n",
       " '暴雪',\n",
       " '来说',\n",
       " '同样',\n",
       " '重要',\n",
       " '如何',\n",
       " '在',\n",
       " '商业活动',\n",
       " '中不受',\n",
       " '人',\n",
       " '欺骗',\n",
       " '？',\n",
       " '87',\n",
       " '版',\n",
       " '红楼梦',\n",
       " '最',\n",
       " '温柔',\n",
       " '的',\n",
       " '四个',\n",
       " '丫鬟',\n",
       " '，',\n",
       " '娶',\n",
       " '谁',\n",
       " '都',\n",
       " '是',\n",
       " '一生',\n",
       " '的',\n",
       " '福气',\n",
       " '凌云',\n",
       " '研发',\n",
       " '的',\n",
       " '国产',\n",
       " '两轮',\n",
       " '电动车',\n",
       " '怎么样',\n",
       " '，',\n",
       " '有',\n",
       " '什么',\n",
       " '惊喜',\n",
       " '？',\n",
       " '房地产',\n",
       " '税',\n",
       " '迟迟',\n",
       " '无法',\n",
       " '出台',\n",
       " '？',\n",
       " '央行',\n",
       " '研究',\n",
       " '局局长',\n",
       " '徐忠',\n",
       " '这样',\n",
       " '说',\n",
       " '我',\n",
       " '四千',\n",
       " '一个月',\n",
       " '，',\n",
       " '老婆',\n",
       " '一千五',\n",
       " '一个月',\n",
       " '，',\n",
       " '存款',\n",
       " '八万',\n",
       " '且',\n",
       " '有',\n",
       " '两',\n",
       " '小孩',\n",
       " '，',\n",
       " '是',\n",
       " '先',\n",
       " '买房',\n",
       " '还是',\n",
       " '先买',\n",
       " '“',\n",
       " '产地',\n",
       " '办展',\n",
       " '”',\n",
       " '模式',\n",
       " '为',\n",
       " '“',\n",
       " '东莞',\n",
       " '制造',\n",
       " '”',\n",
       " '送',\n",
       " '创新',\n",
       " '情报',\n",
       " '全国',\n",
       " '首个',\n",
       " '央地',\n",
       " '融合',\n",
       " '平台',\n",
       " '在',\n",
       " '沪',\n",
       " '落地',\n",
       " '故事',\n",
       " '：',\n",
       " '刘',\n",
       " '主任',\n",
       " '建',\n",
       " '猪场',\n",
       " '什么',\n",
       " '是',\n",
       " '人情',\n",
       " '，',\n",
       " '什么',\n",
       " '是',\n",
       " '世故',\n",
       " '？',\n",
       " '「',\n",
       " '关注',\n",
       " '」',\n",
       " '网络',\n",
       " '自',\n",
       " '媒体',\n",
       " '不是',\n",
       " '“',\n",
       " '法外',\n",
       " '之',\n",
       " '地',\n",
       " '”',\n",
       " '，',\n",
       " '以谣',\n",
       " '博名',\n",
       " '、',\n",
       " '以谣',\n",
       " '博利',\n",
       " '将',\n",
       " '被',\n",
       " '古代',\n",
       " '先进',\n",
       " '文明',\n",
       " '的',\n",
       " '证据',\n",
       " '！',\n",
       " '这是',\n",
       " '历史',\n",
       " '上',\n",
       " '最',\n",
       " '著名',\n",
       " '的',\n",
       " '10',\n",
       " '把',\n",
       " '剑',\n",
       " '加快',\n",
       " '产城',\n",
       " '融合',\n",
       " '以',\n",
       " '科技',\n",
       " '创新',\n",
       " '引领',\n",
       " '新城区',\n",
       " '建设',\n",
       " '取名',\n",
       " '困难',\n",
       " '症',\n",
       " '患者',\n",
       " '皇马',\n",
       " '的',\n",
       " '贝尔',\n",
       " '，',\n",
       " '第一个',\n",
       " '受害者',\n",
       " '就是',\n",
       " '他',\n",
       " '的',\n",
       " '儿子',\n",
       " '夫妻间',\n",
       " '能',\n",
       " '不能',\n",
       " '互看',\n",
       " '手机',\n",
       " '？',\n",
       " '探秘',\n",
       " '、',\n",
       " '日本',\n",
       " '关东',\n",
       " '特大',\n",
       " '地震',\n",
       " '！',\n",
       " '上联',\n",
       " '：',\n",
       " '千峰',\n",
       " '入眠',\n",
       " '松涛',\n",
       " '静',\n",
       " '，',\n",
       " '怎么',\n",
       " '接',\n",
       " '下联',\n",
       " '？',\n",
       " '如何',\n",
       " '阻止',\n",
       " '基拉',\n",
       " '韦厄',\n",
       " '活火山',\n",
       " '的',\n",
       " '熔岩',\n",
       " '单硝酸',\n",
       " '异',\n",
       " '山梨',\n",
       " '酯',\n",
       " '片',\n",
       " '与',\n",
       " '硝酸',\n",
       " '异',\n",
       " '山梨',\n",
       " '酯',\n",
       " '片',\n",
       " '有何',\n",
       " '区别',\n",
       " '？',\n",
       " '廖英强',\n",
       " '被',\n",
       " '证监会',\n",
       " '处罚',\n",
       " '1.2',\n",
       " '亿',\n",
       " '，',\n",
       " '你',\n",
       " '怎么',\n",
       " '看',\n",
       " '？',\n",
       " '女儿',\n",
       " '高烧',\n",
       " '不止',\n",
       " '，',\n",
       " '我',\n",
       " '让',\n",
       " '婆婆',\n",
       " '给',\n",
       " '老公',\n",
       " '打电话',\n",
       " '回家',\n",
       " '，',\n",
       " '通话',\n",
       " '内容',\n",
       " '让',\n",
       " '我',\n",
       " '吓瘫',\n",
       " '在',\n",
       " '地',\n",
       " '上联',\n",
       " '：',\n",
       " '春风',\n",
       " '执笔',\n",
       " '谁',\n",
       " '研墨',\n",
       " '，',\n",
       " '怎么',\n",
       " '对',\n",
       " '下联',\n",
       " '？',\n",
       " '肥乡',\n",
       " '区',\n",
       " '：',\n",
       " '让',\n",
       " '文明',\n",
       " '新风',\n",
       " '吹进',\n",
       " '千家万户',\n",
       " '葫芦',\n",
       " '都',\n",
       " '能',\n",
       " '做成',\n",
       " '什么',\n",
       " '乐器',\n",
       " '？',\n",
       " '为什么',\n",
       " '袁大头',\n",
       " '等',\n",
       " '银元',\n",
       " '吹',\n",
       " '完会',\n",
       " '有',\n",
       " '响声',\n",
       " '？',\n",
       " '伊朗',\n",
       " '为什么',\n",
       " '宁愿',\n",
       " '接受',\n",
       " '制裁',\n",
       " '也',\n",
       " '不',\n",
       " '同意',\n",
       " '修改',\n",
       " '伊核',\n",
       " '协议',\n",
       " '？',\n",
       " '小白',\n",
       " '如何',\n",
       " '做好',\n",
       " '自',\n",
       " '媒体',\n",
       " '的',\n",
       " '几个',\n",
       " '信念',\n",
       " '？',\n",
       " '街头',\n",
       " '偶遇',\n",
       " '2018',\n",
       " '款',\n",
       " '长安',\n",
       " 'CS35',\n",
       " '，',\n",
       " '颜值',\n",
       " '美炸',\n",
       " '！',\n",
       " '或售',\n",
       " '6',\n",
       " '万起',\n",
       " '，',\n",
       " '还',\n",
       " '买',\n",
       " '宝骏',\n",
       " '510',\n",
       " '？',\n",
       " '学生',\n",
       " '党',\n",
       " '买',\n",
       " '什么',\n",
       " '笔记本电脑',\n",
       " '好',\n",
       " '？',\n",
       " '大猩猩',\n",
       " '对',\n",
       " '镜头',\n",
       " '摆',\n",
       " '造型',\n",
       " '表情丰富',\n",
       " '，',\n",
       " '走红',\n",
       " '网络',\n",
       " '收获',\n",
       " '众多',\n",
       " '迷妹',\n",
       " '11',\n",
       " '日',\n",
       " '游戏',\n",
       " '热点',\n",
       " '回顾',\n",
       " '：',\n",
       " '刺激',\n",
       " '战场',\n",
       " '：',\n",
       " '取消',\n",
       " '恶意',\n",
       " '伤害',\n",
       " '队友',\n",
       " '后',\n",
       " '，',\n",
       " '小学生',\n",
       " '开始',\n",
       " '用',\n",
       " '这种',\n",
       " '方式',\n",
       " '贵州',\n",
       " '多彩',\n",
       " '绚丽',\n",
       " '的',\n",
       " '民族',\n",
       " '风凯莉包',\n",
       " '文艺',\n",
       " '女',\n",
       " '青年',\n",
       " '必备',\n",
       " '去',\n",
       " '泰国',\n",
       " '买',\n",
       " '什么',\n",
       " '好',\n",
       " '？',\n",
       " '2018',\n",
       " '中级会计',\n",
       " '考试',\n",
       " '每日',\n",
       " '一练',\n",
       " '0509',\n",
       " '抱娃',\n",
       " '姿势',\n",
       " '不',\n",
       " '正确',\n",
       " '，',\n",
       " '小心',\n",
       " '患熊黛林',\n",
       " '同款',\n",
       " '“',\n",
       " '妈妈',\n",
       " '手',\n",
       " '”',\n",
       " '崩溃',\n",
       " '！',\n",
       " '火车站',\n",
       " '管理员',\n",
       " '睡',\n",
       " '过头',\n",
       " '：',\n",
       " '上百人',\n",
       " '眼睁睁',\n",
       " '看',\n",
       " '空车',\n",
       " '驶出',\n",
       " '一辆车',\n",
       " '的',\n",
       " '寿命',\n",
       " '到底',\n",
       " '多长',\n",
       " '，',\n",
       " '最',\n",
       " '多',\n",
       " '可以',\n",
       " '开',\n",
       " '多久',\n",
       " '？',\n",
       " '听说',\n",
       " '最近',\n",
       " '联通',\n",
       " '搅乱',\n",
       " '了',\n",
       " '北京',\n",
       " '宽带',\n",
       " '市场',\n",
       " '价格',\n",
       " '，',\n",
       " '比小',\n",
       " '运营商',\n",
       " '的',\n",
       " '还',\n",
       " '低',\n",
       " '？',\n",
       " '我',\n",
       " '的',\n",
       " '焕驰',\n",
       " '前',\n",
       " '减震',\n",
       " '有',\n",
       " '异响',\n",
       " '，',\n",
       " '过',\n",
       " '凹凸不平',\n",
       " '的',\n",
       " '时候',\n",
       " '会',\n",
       " '咣当',\n",
       " '一声',\n",
       " '，',\n",
       " '是',\n",
       " '什么',\n",
       " '原因',\n",
       " '？',\n",
       " '如何',\n",
       " '看待',\n",
       " '人民日报',\n",
       " '发文',\n",
       " '痛批',\n",
       " '“',\n",
       " '沉睡',\n",
       " '中',\n",
       " '的',\n",
       " '大学生',\n",
       " ':',\n",
       " '你',\n",
       " '不',\n",
       " '失业',\n",
       " '，',\n",
       " '天理难容',\n",
       " '！',\n",
       " '”',\n",
       " '这件',\n",
       " '事',\n",
       " '南京',\n",
       " '晓庄',\n",
       " '五年',\n",
       " '一贯制',\n",
       " '专转本',\n",
       " '培训',\n",
       " '简章',\n",
       " '2019',\n",
       " '年',\n",
       " '博大',\n",
       " '五年制',\n",
       " '专转本',\n",
       " '辅导',\n",
       " '中国',\n",
       " '传媒大学',\n",
       " '：',\n",
       " '平昌',\n",
       " '冬奥会',\n",
       " '闭幕式',\n",
       " '“',\n",
       " '北京',\n",
       " '八分钟',\n",
       " '”',\n",
       " '主创',\n",
       " '交流会',\n",
       " '举行',\n",
       " '哪种',\n",
       " 'POS',\n",
       " '机',\n",
       " '适合',\n",
       " '养卡',\n",
       " '？',\n",
       " '《',\n",
       " '复仇者',\n",
       " '联盟',\n",
       " '3',\n",
       " '》',\n",
       " '中',\n",
       " '奇异',\n",
       " '博士',\n",
       " '为什么',\n",
       " '不用',\n",
       " '时间',\n",
       " '宝石',\n",
       " '跟',\n",
       " '灭霸',\n",
       " '谈判',\n",
       " '？',\n",
       " '张鹤伦',\n",
       " '被',\n",
       " '女',\n",
       " '粉丝',\n",
       " '拿',\n",
       " '水枪',\n",
       " '对射',\n",
       " '：',\n",
       " '这个',\n",
       " '话筒',\n",
       " '5',\n",
       " '千',\n",
       " '多亿',\n",
       " '，',\n",
       " '赔钱',\n",
       " '驾驶证',\n",
       " '考试',\n",
       " '预约',\n",
       " '是',\n",
       " '按',\n",
       " '什么',\n",
       " '规则',\n",
       " '进行',\n",
       " '排序',\n",
       " '的',\n",
       " '？',\n",
       " '为',\n",
       " '创业',\n",
       " '的',\n",
       " '你',\n",
       " '准备',\n",
       " '的',\n",
       " '（',\n",
       " '2018',\n",
       " '年',\n",
       " '新',\n",
       " '商机',\n",
       " '）',\n",
       " '都',\n",
       " '说',\n",
       " '北京',\n",
       " '叫',\n",
       " '帝都',\n",
       " '，',\n",
       " '上海',\n",
       " '叫',\n",
       " '魔',\n",
       " '都',\n",
       " '，',\n",
       " '那',\n",
       " '深圳',\n",
       " '叫',\n",
       " '什么',\n",
       " '？',\n",
       " '阿里巴巴',\n",
       " '区块',\n",
       " '链',\n",
       " '专利',\n",
       " '排名',\n",
       " '全球',\n",
       " '第一',\n",
       " '，',\n",
       " '阿里',\n",
       " '都',\n",
       " '有',\n",
       " '哪些',\n",
       " '区块',\n",
       " '链',\n",
       " '应用',\n",
       " '？',\n",
       " '揭秘',\n",
       " '您',\n",
       " '身边',\n",
       " '十大类',\n",
       " '30',\n",
       " '种',\n",
       " '轻松',\n",
       " '赚钱',\n",
       " '被动',\n",
       " '收入',\n",
       " '的',\n",
       " '生意',\n",
       " '机会',\n",
       " '以色列',\n",
       " '总理',\n",
       " '将',\n",
       " '出席',\n",
       " '俄罗斯',\n",
       " '阅兵式',\n",
       " '5',\n",
       " '月',\n",
       " '9',\n",
       " '日',\n",
       " '会晤',\n",
       " '普京',\n",
       " '省',\n",
       " '“',\n",
       " '双创',\n",
       " '计划',\n",
       " '”',\n",
       " '项目',\n",
       " '申报',\n",
       " '工作',\n",
       " '启动',\n",
       " '军用',\n",
       " '匕首',\n",
       " '在',\n",
       " '关键时刻',\n",
       " '能',\n",
       " '发挥',\n",
       " '哪些',\n",
       " '作用',\n",
       " '？',\n",
       " '拥抱',\n",
       " '编辑',\n",
       " '3.0',\n",
       " '时代',\n",
       " '！',\n",
       " '内容',\n",
       " '升级',\n",
       " '为',\n",
       " '产品',\n",
       " '海内外',\n",
       " '媒体',\n",
       " '如何',\n",
       " '规划',\n",
       " '下个',\n",
       " '十年',\n",
       " '？',\n",
       " '银行',\n",
       " '“',\n",
       " '恼羞成怒',\n",
       " '”',\n",
       " '：',\n",
       " '在',\n",
       " '微信',\n",
       " '支付宝',\n",
       " '上',\n",
       " '借款',\n",
       " '两次',\n",
       " '，',\n",
       " '不',\n",
       " '给',\n",
       " '贷款',\n",
       " '2018',\n",
       " '年',\n",
       " '教育',\n",
       " '科技',\n",
       " '技术',\n",
       " '新',\n",
       " '主题',\n",
       " '—',\n",
       " '—',\n",
       " 'AR',\n",
       " '+',\n",
       " '教育',\n",
       " '航母',\n",
       " '受到',\n",
       " '多少',\n",
       " '攻击',\n",
       " '才',\n",
       " '会',\n",
       " '被',\n",
       " '击沉',\n",
       " '？',\n",
       " '中国',\n",
       " '的',\n",
       " '东风',\n",
       " '家族',\n",
       " '让',\n",
       " '你',\n",
       " '刮目相看',\n",
       " '慕容',\n",
       " '云海',\n",
       " '刁难',\n",
       " '楚雨',\n",
       " '荨',\n",
       " '，',\n",
       " '买',\n",
       " '矿泉水',\n",
       " '一下子',\n",
       " '买',\n",
       " '500',\n",
       " '块钱',\n",
       " '的',\n",
       " '，',\n",
       " '直接',\n",
       " '被',\n",
       " '累死',\n",
       " '非洲',\n",
       " '兄弟',\n",
       " '拿',\n",
       " 'AK',\n",
       " '技巧',\n",
       " '，',\n",
       " '不靠',\n",
       " '技术',\n",
       " '全靠',\n",
       " '信仰',\n",
       " 'PS4',\n",
       " '比',\n",
       " 'SWITCH',\n",
       " '销量',\n",
       " '高',\n",
       " '，',\n",
       " '而且',\n",
       " '游戏',\n",
       " '画面',\n",
       " '好',\n",
       " '，',\n",
       " '为什么',\n",
       " '还有',\n",
       " '人买',\n",
       " 'SWITCH',\n",
       " '？',\n",
       " '《',\n",
       " '我',\n",
       " '的',\n",
       " '世界',\n",
       " '》',\n",
       " '问答题',\n",
       " '：',\n",
       " '这',\n",
       " '几个',\n",
       " '头颅',\n",
       " '都',\n",
       " '有',\n",
       " '什么',\n",
       " '作用',\n",
       " '呢',\n",
       " '？',\n",
       " '答对',\n",
       " '的',\n",
       " '都',\n",
       " '是',\n",
       " '在',\n",
       " '中国',\n",
       " '古代',\n",
       " '，',\n",
       " '珍珠',\n",
       " '是从',\n",
       " '什么',\n",
       " '时期',\n",
       " '开始',\n",
       " '作为',\n",
       " '首饰',\n",
       " '使用',\n",
       " '的',\n",
       " '？',\n",
       " '少女',\n",
       " '患',\n",
       " '怪病',\n",
       " '，',\n",
       " '一',\n",
       " '激动',\n",
       " '就',\n",
       " '血流',\n",
       " '不止',\n",
       " '，',\n",
       " '只能靠',\n",
       " '药物',\n",
       " '止血',\n",
       " '！',\n",
       " '地球',\n",
       " '这是',\n",
       " '怎么',\n",
       " '了',\n",
       " '？',\n",
       " '美国',\n",
       " '夏威夷',\n",
       " '群岛',\n",
       " '突发',\n",
       " '6.9',\n",
       " '级',\n",
       " '地震',\n",
       " '，',\n",
       " '游客',\n",
       " '紧急',\n",
       " '疏散',\n",
       " '！',\n",
       " '海航',\n",
       " '资本',\n",
       " '助力',\n",
       " '海南',\n",
       " '自贸区',\n",
       " '建设',\n",
       " '为何',\n",
       " '联合国',\n",
       " '维和部队',\n",
       " '不去',\n",
       " '打击',\n",
       " 'ISIS',\n",
       " '？',\n",
       " '王者',\n",
       " '荣耀',\n",
       " '：',\n",
       " '李白',\n",
       " '和',\n",
       " '韩信',\n",
       " '还',\n",
       " '没',\n",
       " '加强',\n",
       " '这个',\n",
       " '人民币',\n",
       " '战士',\n",
       " '却',\n",
       " '先',\n",
       " '加强',\n",
       " '了',\n",
       " '，',\n",
       " '悲哀',\n",
       " '！',\n",
       " '军事',\n",
       " '才能',\n",
       " '“',\n",
       " '平庸',\n",
       " '”',\n",
       " '的',\n",
       " '华盛顿',\n",
       " '，',\n",
       " '为何',\n",
       " '能',\n",
       " '打赢',\n",
       " '独立战争',\n",
       " '（',\n",
       " '一',\n",
       " '）',\n",
       " '？',\n",
       " '济南',\n",
       " '堵车',\n",
       " '是因为',\n",
       " '道路',\n",
       " '规划',\n",
       " '不好',\n",
       " '吗',\n",
       " '？',\n",
       " '关于',\n",
       " '人工智能',\n",
       " '方面',\n",
       " '的',\n",
       " '研究生',\n",
       " '专业',\n",
       " '有',\n",
       " '哪些',\n",
       " '？',\n",
       " '2018',\n",
       " '年',\n",
       " '南沙',\n",
       " '公寓',\n",
       " '是否',\n",
       " '值得',\n",
       " '投资',\n",
       " '《',\n",
       " '泡沫',\n",
       " '之夏',\n",
       " '》',\n",
       " '22',\n",
       " '点',\n",
       " '独播',\n",
       " '！',\n",
       " '秦',\n",
       " '俊杰',\n",
       " '霸气',\n",
       " '狂',\n",
       " ...]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first tokens only; the full token list is far too long to display.\n",
    "all_words[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d2b13003",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tally how many times each token occurs in all_words.\n",
    "words_count = Counter()\n",
    "words_count.update(all_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "5cd836bd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('，', 34665),\n",
       " ('？', 23205),\n",
       " ('的', 21634),\n",
       " ('！', 9332),\n",
       " ('：', 7684),\n",
       " ('了', 6694),\n",
       " ('是', 5824),\n",
       " ('“', 5551),\n",
       " ('”', 5471),\n",
       " ('你', 4839),\n",
       " ('有', 4042),\n",
       " ('在', 3987),\n",
       " ('吗', 3686),\n",
       " ('什么', 3059),\n",
       " ('中国', 2930),\n",
       " ('为什么', 2790),\n",
       " ('如何', 2679),\n",
       " ('都', 2481),\n",
       " ('和', 2447),\n",
       " ('人', 2260),\n",
       " ('怎么', 2175),\n",
       " ('《', 2137),\n",
       " ('被', 2130),\n",
       " ('不', 2127),\n",
       " ('》', 2125),\n",
       " ('、', 1840),\n",
       " ('会', 1769),\n",
       " ('对', 1757),\n",
       " ('我', 1755),\n",
       " ('美国', 1698),\n",
       " ('年', 1601),\n",
       " ('上', 1477),\n",
       " ('—', 1457),\n",
       " ('看', 1441),\n",
       " ('最', 1440),\n",
       " ('能', 1415),\n",
       " ('这', 1399),\n",
       " ('一个', 1291),\n",
       " ('要', 1272),\n",
       " ('好', 1271),\n",
       " ('还', 1267),\n",
       " ('就', 1256),\n",
       " ('将', 1255),\n",
       " ('后', 1251),\n",
       " ('与', 1234),\n",
       " ('大', 1232),\n",
       " ('中', 1184),\n",
       " ('让', 1180),\n",
       " ('哪些', 1173),\n",
       " ('5', 1064),\n",
       " ('他', 1060),\n",
       " ('世界', 1005),\n",
       " ('却', 996),\n",
       " ('去', 988),\n",
       " ('新', 981),\n",
       " ('到', 981),\n",
       " ('月', 980),\n",
       " ('2018', 972),\n",
       " ('谁', 969),\n",
       " ('可以', 928),\n",
       " ('多', 926),\n",
       " ('说', 924),\n",
       " ('也', 913),\n",
       " ('又', 906),\n",
       " ('上联', 885),\n",
       " ('个', 871),\n",
       " ('来', 863),\n",
       " ('网友', 852),\n",
       " ('如果', 847),\n",
       " ('手机', 844),\n",
       " ('下联', 838),\n",
       " ('日本', 819),\n",
       " ('做', 818),\n",
       " ('用', 813),\n",
       " ('没有', 810),\n",
       " ('还是', 808),\n",
       " ('万', 807),\n",
       " ('现在', 789),\n",
       " ('为', 780),\n",
       " ('农村', 775),\n",
       " ('为何', 739),\n",
       " ('买', 731),\n",
       " ('…', 725),\n",
       " ('「', 720),\n",
       " ('」', 717),\n",
       " ('给', 711),\n",
       " ('这个', 710),\n",
       " ('3', 695),\n",
       " ('把', 686),\n",
       " ('岁', 672),\n",
       " ('-', 660),\n",
       " ('怎样', 659),\n",
       " ('呢', 645),\n",
       " ('多少', 641),\n",
       " ('一', 630),\n",
       " ('｜', 630),\n",
       " ('10', 628),\n",
       " ('（', 628),\n",
       " ('很', 623),\n",
       " ('）', 621),\n",
       " ('国家', 613),\n",
       " ('到底', 610),\n",
       " ('哪个', 610),\n",
       " ('知道', 606),\n",
       " ('更', 596),\n",
       " ('她', 592),\n",
       " ('俄罗斯', 581),\n",
       " ('4', 580),\n",
       " ('真的', 580),\n",
       " ('想', 575),\n",
       " ('日', 567),\n",
       " ('游戏', 565),\n",
       " ('我们', 564),\n",
       " ('小', 564),\n",
       " ('过', 561),\n",
       " ('下', 551),\n",
       " ('自己', 542),\n",
       " ('里', 538),\n",
       " ('这些', 535),\n",
       " ('2', 523),\n",
       " ('怎么样', 521),\n",
       " ('钱', 516),\n",
       " ('从', 492),\n",
       " ('城市', 485),\n",
       " ('未来', 481),\n",
       " ('这么', 480),\n",
       " ('1', 472),\n",
       " ('亿', 471),\n",
       " ('王者', 471),\n",
       " ('成为', 471),\n",
       " ('公司', 466),\n",
       " ('没', 464),\n",
       " ('汽车', 464),\n",
       " ('孩子', 463),\n",
       " ('发展', 463),\n",
       " ('吃', 459),\n",
       " ('打', 457),\n",
       " ('车', 456),\n",
       " ('8', 447),\n",
       " ('看待', 441),\n",
       " ('荣耀', 441),\n",
       " ('伊朗', 436),\n",
       " ('再', 436),\n",
       " ('活动', 434),\n",
       " ('该', 431),\n",
       " ('6', 430),\n",
       " ('以色列', 417),\n",
       " ('比', 417),\n",
       " ('高', 416),\n",
       " ('旅游', 416),\n",
       " ('着', 415),\n",
       " ('这样', 414),\n",
       " ('出', 414),\n",
       " ('才', 409),\n",
       " ('第一', 406),\n",
       " ('时', 406),\n",
       " ('那么', 403),\n",
       " ('不是', 401),\n",
       " ('全球', 396),\n",
       " ('前', 387),\n",
       " ('美', 385),\n",
       " ('市场', 384),\n",
       " ('之', 382),\n",
       " ('企业', 375),\n",
       " ('英雄', 374),\n",
       " ('20', 371),\n",
       " ('生活', 366),\n",
       " ('或', 365),\n",
       " ('还有', 363),\n",
       " ('投资', 362),\n",
       " ('只', 360),\n",
       " ('特朗普', 357),\n",
       " ('·', 355),\n",
       " ('就是', 348),\n",
       " ('小米', 345),\n",
       " ('不能', 342),\n",
       " ('哪', 339),\n",
       " ('需要', 339),\n",
       " ('而', 339),\n",
       " ('印度', 337),\n",
       " ('技术', 335),\n",
       " ('评价', 332),\n",
       " ('7', 329),\n",
       " ('等', 328),\n",
       " ('选择', 328),\n",
       " ('+', 327),\n",
       " ('科技', 324),\n",
       " ('链', 323),\n",
       " ('教育', 323),\n",
       " ('已', 319),\n",
       " ('他们', 318),\n",
       " ('它', 316),\n",
       " ('可能', 315),\n",
       " ('喜欢', 310),\n",
       " ('北京', 308),\n",
       " ('当', 306),\n",
       " ('区块', 305),\n",
       " ('国际', 305),\n",
       " ('上市', 305),\n",
       " ('马云', 304),\n",
       " ('联想', 301),\n",
       " ('但', 300),\n",
       " ('玩', 300),\n",
       " ('成', 300),\n",
       " ('吧', 299),\n",
       " ('房子', 297),\n",
       " ('学生', 296),\n",
       " ('行业', 296),\n",
       " ('创业', 293),\n",
       " ('哪里', 291),\n",
       " ('元', 291),\n",
       " ('协议', 290),\n",
       " ('开', 290),\n",
       " ('品牌', 290),\n",
       " ('走', 289),\n",
       " ('9', 288),\n",
       " ('怎么办', 286),\n",
       " ('房价', 285),\n",
       " ('地方', 285),\n",
       " ('影响', 285),\n",
       " ('带', 285),\n",
       " ('得', 283),\n",
       " ('最后', 282),\n",
       " ('不会', 282),\n",
       " ('们', 281),\n",
       " ('原因', 280),\n",
       " ('成功', 280),\n",
       " ('最大', 279),\n",
       " ('太', 279),\n",
       " ('即将', 278),\n",
       " ('是否', 277),\n",
       " ('价格', 276),\n",
       " ('老师', 274),\n",
       " ('腾讯', 274),\n",
       " ('经济', 268),\n",
       " ('可', 266),\n",
       " ('应该', 265),\n",
       " ('卖', 264),\n",
       " ('看看', 263),\n",
       " ('比较', 262),\n",
       " ('叙利亚', 262),\n",
       " ('时候', 261),\n",
       " ('银行', 261),\n",
       " ('大家', 261),\n",
       " ('拿', 260),\n",
       " ('十年', 260),\n",
       " ('问题', 259),\n",
       " ('故事', 258),\n",
       " ('值得', 258),\n",
       " ('微信', 257),\n",
       " ('时代', 255),\n",
       " ('全国', 252),\n",
       " ('历史', 252),\n",
       " ('项目', 252),\n",
       " ('开始', 251),\n",
       " ('工作', 251),\n",
       " ('华为', 251),\n",
       " ('已经', 250),\n",
       " ('上海', 249),\n",
       " ('文化', 249),\n",
       " ('老', 248),\n",
       " ('你们', 247),\n",
       " ('万元', 246),\n",
       " ('推荐', 246),\n",
       " ('。', 245),\n",
       " ('买房', 241),\n",
       " ('联盟', 241),\n",
       " ('普京', 241),\n",
       " ('很多', 241),\n",
       " ('求生', 240),\n",
       " ('绝地', 239),\n",
       " ('退出', 238),\n",
       " ('韩国', 238),\n",
       " ('升级', 237),\n",
       " ('跑', 234),\n",
       " ('火箭', 228),\n",
       " ('苹果', 227),\n",
       " ('发布', 227),\n",
       " ('只有', 226),\n",
       " ('号', 225),\n",
       " ('向', 225),\n",
       " ('出现', 225),\n",
       " ('厉害', 225),\n",
       " ('今年', 224),\n",
       " ('比赛', 223),\n",
       " ('什么样', 223),\n",
       " ('国内', 222),\n",
       " ('平台', 221),\n",
       " ('跟', 221),\n",
       " ('30', 221),\n",
       " ('最好', 221),\n",
       " ('分', 219),\n",
       " ('12', 218),\n",
       " ('快', 213),\n",
       " ('币', 213),\n",
       " ('体验', 213),\n",
       " ('汶川', 213),\n",
       " ('家', 213),\n",
       " ('适合', 211),\n",
       " ('出席', 211)]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The 300 most frequent (token, count) pairs, highest count first.\n",
    "# Counter.most_common replaces the manual sort-by-value-and-slice.\n",
    "words_count.most_common(300)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "d28b35f6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[34665,\n",
       " 23205,\n",
       " 21634,\n",
       " 9332,\n",
       " 7684,\n",
       " 6694,\n",
       " 5824,\n",
       " 5551,\n",
       " 5471,\n",
       " 4839,\n",
       " 4042,\n",
       " 3987,\n",
       " 3686,\n",
       " 3059,\n",
       " 2930,\n",
       " 2790,\n",
       " 2679,\n",
       " 2481,\n",
       " 2447,\n",
       " 2260,\n",
       " 2175,\n",
       " 2137,\n",
       " 2130,\n",
       " 2127,\n",
       " 2125,\n",
       " 1840,\n",
       " 1769,\n",
       " 1757,\n",
       " 1755,\n",
       " 1698]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The 30 largest counts on their own (tokens dropped), in descending order.\n",
    "[count for _token, count in words_count.most_common(30)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bc0f366",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "05353c5e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x2227cbd7ee0>]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAhGUlEQVR4nO3deXzU9Z3H8dcnMzlICOFIuAnhRhAPjAqCVlC8anFb7VZbr1aX1brbuu7aat1aa7ftautWu7pVqq3Wq91VUUoV7wNUQEBATjnlhnDfkITP/jE/MMRAJskkv5nM+/l4zIPf/H6/+c37wUze+eX7+81vzN0REZH0kBF2ABERaToqfRGRNKLSFxFJIyp9EZE0otIXEUkjKn0RkTQSjWclM1sB7AQqgQp3L622/GzgJWB5MOsFd787YSlFRCQh4ir9wAh333SM5ZPc/eKGBhIRkcaj4R0RkTQS756+A6+ZmQOPuPvYGtYZamazgbXAv7n7vGNtsLCw0EtKSuoUVkQk3c2YMWOTuxfV9/Hxlv5wd19jZu2B181sobu/V2X5TKC7u+8ys4uAF4E+1TdiZmOAMQDFxcVMnz69vrlFRNKSmX3WkMfHNbzj7muCfzcC44DTqi3f4e67gumXgUwzK6xhO2PdvdTdS4uK6v2LSkRE6qnW0jezPDPLPzQNnAfMrbZORzOzYPq0YLubEx9XREQaIp7hnQ7AuKDTo8Az7j7RzG4AcPeHgcuAG82sAtgLXO66fKeISNKptfTdfRlwYg3zH64y/SDwYGKjiYhIoumUTRGRNKLSFxFJIyp9EZE0knKlv2j9Tn75ygJ27isPO4qISMpJudJftWUPj7y7jE837Aw7iohIykm50j++SwEAr8/fGHISEZHUk3Kl37Egh7ysCK/MXRd2FBGRlJNypQ9wes92fLZ5D9v3aFxfRKQuUrL0vza4CwDvLz3W5f1FRKS6lCz94b1j13KbMGdtyElERFJLSpZ+69wsStrlMnXZlrCjiIiklJQsfYAhPduxefcBynbuDzuKiEjKSNnSP29gB0BDPCIidZGypX9mn9iXsDw1pUFfIiMiklZStvQzIxmc2LWApWW7Wb5pd9hxRERSQsqWPsBPRg8E4N6JC0NOIiKSGlK69AcXt6F9fjavzF3PngMVYccREUl6KV36AP/4pV4AvPixDuiKiNQm5Uv/itO6AfD4B8tDTiIikvxSvvRzs6IM7NyKTzfsYvd+DfGIiBxLypc+wLeH9QDgkfeWhZxERCS5NYvS/+rJsQuw/fbNxSEnERFJbs2i9CMZxqgBsU/ovrNIX64iInI0cZW+ma0ws0/MbJaZTa9huZnZb81siZnNMbPBiY96bD+75HgAHtDevojIUUXrsO4Idz/aBewvBPoEt9OB3wX/NpmOBTn0LMzj45Xb2HugkhZZkaZ8ehGRlJCo4Z1LgD95zBSgtZl1StC243b10O4ATF6iL1cREalJvKXvwGtmNsPMxtSwvAuwqsr91cG8I5jZGDObbmbTy8rK6p62FhccH/s98+KsNQnftohIcxBv6Q9398HEhnFuMrOz6vNk7j7W3UvdvbSoqKg+mzimjgU5tM7N5L1PE/8LRUSkOYir9N19TfDvRmAccFq1VdYA3arc7xrMa3Ij+7Vn574KVm3ZE8bTi4gktVpL38zyzCz/0DRwHjC32mrjgauDs3iGANvdfV3C08bhstKuADw1VdfZFxGpLp6zdzoA48zs0PrPuPtEM7sBwN0fBl4GLgKWAHuAbzdO3NoN7dkOgClLN4cVQUQkadVa+u6+DDixhvkPV5l24KbERqsfM+MrJ3bmr7PXUl55kMxIs/j8mYhIQjTLRhzQqRUAby7YEHISEZHk0ixL/9LBsbNF739Dn84VEamqWZZ++1Y5dC7IYeH6neyvqAw7johI0miWpQ9w/Zk9AfhAB3RFRA5rtqV//vEdAXh+xuqQk4iIJI9mW/pdWrcg
K5rBhDnr2HtAQzwiItCMSx/g5nP7AHDTMzNDTiIikhyadenfcFYvAN5auJGtuw+EnEZEJHzNuvQzMox7Lz0BgF+9tijkNCIi4WvWpQ/w9eBaPM9MXRlyEhGR8DX70jczvjwodp39Oau3hRtGRCRkzb70Aa47swcA1z/xha/3FRFJK2lR+oOL29C7fUs27tzPovU7w44jIhKatCh9gHsvix3QvfHpGSEnEREJT9qU/uDiNuRnR1lWtltj+yKSttKm9AEeueoUAEY/+D6xrwAQEUkvaVX6Z/QuZES/2Bey/1rn7YtIGkqr0gf4728OBuCht5dSUXkw5DQiIk0r7Uq/ZXaU64fHTuH8/aTlIacREWlaaVf6ALde0A+AeyYuZF+5rsApIukjLUs/Oxrh66fELs9wx7i5IacREWk6aVn6APcEF2J7fuZqtu8tDzmNiEjTiLv0zSxiZh+b2YQall1rZmVmNiu4XZ/YmImXkWHccdFxAHzr0Sk6hVNE0kJd9vS/Dyw4xvK/uPtJwe3RBuZqEt8eVkJWNIO5a3bw5JTPwo4jItLo4ip9M+sKfBlIiTKPVzSSwdv/djYAd740jwMVOoVTRJq3ePf07wd+AByrFS81szlm9pyZdWtwsibSpXULvjMsdgrn9579OOQ0IiKNq9bSN7OLgY3ufqwrlf0VKHH3E4DXgSeOsq0xZjbdzKaXlZXVK3BjuO3C/gBMnLeev81ZF3IaEZHGE8+e/jBgtJmtAP4MjDSzp6qu4O6b3X1/cPdR4JSaNuTuY9291N1Li4qKGhA7sbKiGfzfDUOB2Jeo66CuiDRXtZa+u9/u7l3dvQS4HHjL3a+suo6ZdapydzTHPuCblE4tactXTuwMwP1vLA45jYhI46j3efpmdreZjQ7ufs/M5pnZbOB7wLWJCNfUfvKVAQA88OZiHdQVkWapTqXv7u+4+8XB9J3uPj6Yvt3dB7r7ie4+wt0XNkbYxlbYMptbRvUF4M6X9EldEWl+0vYTuUcz5qyeAPz5o1UsWLcj5DQiIoml0q8mJzPCzy4ZCMCFD0yiXJdfFpFmRKVfg6uGlvB3J8UO6n7lvydTeVBn84hI86DSP4qff3UQ+dlRFq7fybV/nBZ2HBGRhFDpH0VedpR3fzACgEmLN/HIu0tDTiQi0nAq/WNom5fFqzefBcAvX1nI1t0HQk4kItIwKv1a9OuYzy+/NgiAM+99m/0V+qYtEUldKv04/H1pN/p3zGfX/gr6/ftEFb+IpCyVfhwiGca47w5jcHFrAK79w0cc1Bk9IpKCVPpxapEV4dkxQwD4cNlm7p4wP+REIiJ1p9Kvg+xohLf+9UsAPP7BCh56e0nIiURE6kalX0c9i1ry/I1nAPCrVxexZOOukBOJiMRPpV8Pp3RvwzPXnw7ARQ9MYs22vSEnEhGJj0q/ns7oXcjZ/Yo4UHmQy373gQ7sikhKUOk3wB+vPZUOrbJZt30fr83fEHYcEZFaqfQbwMx47obY+P4NT81g1ZY9IScSETk2lX4DdWubyzdPLwZg5H3vsHnX/loeISISHpV+AvzHJcczuLg15ZXOqN+8x8595WFHEhGpkUo/ATIyjGf+YQg9CvPYsvsAVz42jTmrt4UdS0TkC1T6CZKTGWH8Pw3j3OPaM2f1Nm7+8yze0MFdEUkyKv0Eys/J5NFrTuXK07uzfsc+rv/TdF6btz7sWCIih6n0G8HP/u54bhrRG4CfjJ/Hs9NWhpxIRCQm7tI3s4iZfWxmE2pYlm1mfzGzJWY21cxKEpoyBd00ojc/vKA/2/aUc/sLn/DWQg31iEj46rKn/31gwVGWXQdsdffewG+AexoarDm48exe/OYbJwLwncen6+CuiIQurtI3s67Al4FHj7LKJcATwfRzwDlmZg2Pl/ouOL4Td31lAACjH3yflZv1AS4RCU+8e/r3Az8ADh5leRdgFYC7VwDbgXYNDddcXHNGCVcN6Q7Aub95V9fpEZHQ1Fr6ZnYxsNHd
ZzT0ycxsjJlNN7PpZWVlDd1cyjAz/v3i4zh/YAcOVBzkrr/OCzuSiKSpePb0hwGjzWwF8GdgpJk9VW2dNUA3ADOLAgXA5uobcvex7l7q7qVFRUUNCp5qsqMRfvX12Pj+01NXcse4T0JOJCLpqNbSd/fb3b2ru5cAlwNvufuV1VYbD1wTTF8WrKMxjGpa5WTy9PWn06Mwj6enrmTse0vDjiQiaabe5+mb2d1mNjq4+xjQzsyWALcAtyUiXHM0rHchP7qoPwAPvb2UcR+vDjmRiKQTC2uHvLS01KdPnx7KcyeDZ6au5M6X5uLAo9eUckr3NrTKyQw7logkOTOb4e6l9X28PpEbkm+eXswvvzaIyoPOt//4Efe9uohKndUjIo1MpR+irw3uyoR/Hk7Pojye+PAzBt31Kqu36jx+EWk8Kv0QRTKM47sUcM+lJ/Ct04vZc6CSr/7PB5Tt1BexiEjjUOkngVNL2vKji46jZ2EeZTv3c/8bn7J4w86wY4lIM6TSTxJ52VHG3TSM3KxI7Dz+F+eyv6Iy7Fgi0syo9JNIQYtMPrrjXM4f2IFpy7dw2s/fZPf+irBjiUgzotJPMnnZUW678DguHdyV7XvLueGpGfxh8vKwY4lIMxENO4B8UY/CPG4+tw/LNu1i7prtzF2znd7tWzKgcysKW2aHHU9EUpj29JNUt7a5jPvuMK4b3oOte8q5+g/TuO15Xa9HRBpGpZ/kxpzVixe+ewanlbRl0uIyvvzbSTqzR0TqTaWf5LKiGQwubsONI3pxZp9C5q3dwR/eX870FVvCjiYiKUilnyJG9GvPg98cTMvsKM9OW8UVv59CeeXRvtNGRKRmKv0UkpMZ4f0fjuRfzu1LeaVz7n+9y+gHJ7N5lz7BKyLx0dk7KaYgN5PLSruycsseNuzYx+Qlm/jzR6sY0LkVQ3u2IyczEnZEEUliurRyClu9dQ/D73n78P3/+LvjuTL4Ll4RaZ50aeU01rVNLu/dOoLx/zSMDINHJy3jusc/YsG6HWFHE5EkpeGdFFfcLpdicrnitGLmrt3Bmws30rdjPoUts8nNipCXrZdYRD6n4Z1mZtBdr7JzX+x6PTmZGXxw2zm0zcsKOZWIJEpDh3e0G9jM/P7qUhZv3MWi9Tt4aspKfj9pGaeWtGFk/w5hRxORJKAx/WZmSM92XDWkO1cPLSGSYfzunaVc98R09hzQ1TpFRKXfbPXtkM/sn5zHjy8egDv845MzuO7xj3hr4Yawo4lIiDS804y1zI7ypb5FvNy9DVv3HGDJxl1kRTM01COSxnQgN418/eEPmLlyG3lZEW4Z1Zdrh/UIO5KI1FGjn6dvZjlmNs3MZpvZPDP7aQ3rXGtmZWY2K7hdX99A0ni+f05frhrSnYwMY9LiTazZtpdtew6EHUtEmlA8wzv7gZHuvsvMMoHJZvaKu0+ptt5f3P2fEh9REmV4n0KG9ylk/rrY+fxv/udbRDKMd289m65tcsOOJyJNoNbS99j4z67gbmZwC2dMSBLiF189npmfbWPppl088u4yXp+/gf4dW9GqRZSBnQvCjicijSiuA7lmFgFmAL2Bh9x9ag2rXWpmZwGfAv/i7qtq2M4YYAxAcXFxvUNLw/Run0/v9vks2Rgr/Z/+df7hZW/+65foVdQyxHQi0pjqdCDXzFoD44B/dve5Vea3A3a5+34z+0fgG+4+8ljb0oHc5DB3zXZ27qtgwbod3D1hPreM6kvfDi1p1zKbU0vahh1PRKpp6IHcOp+9Y2Z3Anvc/ddHWR4Btrj7MccJVPrJ5bPNu/nSr945Yt60H51D+1Y54QQSkRo1+mUYzKwIKHf3bWbWAhgF3FNtnU7uvi64OxpYUN9AEo7u7fKY/MMR7NxXwftLNvEff1vAS7PW0rl1C3IyMzirbxGZEX2WTyTVxTOm3wl4ItiDzwD+190nmNndwHR3Hw98z8xGAxXAFuDaxgosjefQGTyHvobx5y9//rv7sWtKOec4fahL
JNXpw1lSo1Vb9rC3vJKNO/Zz5WNTuX54D87sWwTASV1bU5CbGXJCkfSkq2xKo+jWNrbX36V1C7IiGTw6eTmPTl4OwOWnduM/Lz0hzHgiUk8qfTmmvOwob9zyJcqCL1+/9bnZfLZ5D0vLYh/daN0ik3Yts8OMKCJ1oNKXWhW3y6W43ed7/pMWb+Kc+94FIDuawcwfj9I3dImkCP2kSp384quDmLlyKwAfrdjCU1NWsmX3AZW+SIrQT6rUSbe2uYfH+7OjEZ6aspJRv3mXDDMAMsy4a/RALjula5gxReQoVPpSb8P7FPK9kb3ZW155eN5TU1Yya9VWlb5IklLpS721zI5yy3n9jpg3cd563l5Yxg1PzgAgGjFuGdWXnrqej0hSUOlLQn15UGfeXriR5Zt2c9CdxRt3cVK31ip9kSSh0peEuu3C/tx2YX8AKioP0vuOV5izejuvzVsPQHZmhGG92hHVJR1EQqHSl0YTjWRQ2DKb8bPXMn722sPzf391KaMG6JIOImFQ6UujeuX7Z7Jhxz4Aynbt59t//Igtu/eHnEokfan0pVEV5WdTlB/7xO6h7+O9d+IiHn532eF1RvRrz51fGRBKPpF0o4FVaTIFLTK5aUQvhvUuZFCXAgZ1KaDi4EFeX7A+7GgiaUN7+tJkzIxbz+9/xLx/f/ETXpq1lrcWbjhifkGLTE7prm/uEkk0lb6Eqn1+Djv3VfCdx794me23/+1sehTmhZBKpPlS6Uuobjy7FyP6tedgle91+HjlVu7663y27D6g0hdJMJW+hCozksGgrkd+nfKB4Ju77n/jU9rnf/4dvSd2K+DqoSVNGU+k2VHpS9LpVdSS/h3zWVa2m2VluwHYvrec1+avV+mLNJBKX5JO27wsJt581hHz7p24kLHvLTvKI0QkXip9SQm5WREqDjoj73sHq7asRVaE333rlMOXfBaRo1PpS0q44PiOLN64i4qDfsT8HXvLmbR4EwvW7VDpi8RBpS8poXf7fB64/OQvzF9atotz7nv3iGv6i8jR1Vr6ZpYDvAdkB+s/5+4/qbZONvAn4BRgM/ANd1+R8LQi1eRlxd7Ctz3/CT8ZP+8Ly4f1KuShbw1u6lgiSSuePf39wEh332VmmcBkM3vF3adUWec6YKu79zazy4F7gG80Ql6RI3Rolc2t5/djY3BRt6qmLt/ClGWbQ0glkrxqLX13d2BXcDczuHm11S4B7gqmnwMeNDMLHivSaMyMm0b0rnHZz/82n6enrmziRCLJLa4xfTOLADOA3sBD7j612ipdgFUA7l5hZtuBdsCmBGYVqZMWmRH2HKjkzHvfqnF5u7xsnv2HIbTIijRxMpHwxFX67l4JnGRmrYFxZna8u8+t65OZ2RhgDEBxcXFdHy5SJxef2Jm12/dx8OAX/+BctXUPH63Yyrrte/VVjpJW6nT2jrtvM7O3gQuAqqW/BugGrDazKFBA7IBu9cePBcYClJaWauhHGlXfDvn8+usn1rhs4tx1fLRiK/vKDzZxKpFwxXP2ThFQHhR+C2AUsQO1VY0HrgE+BC4D3tJ4viSz7MzYkM4vXl5A69zMGtc5ubgN1w3v0ZSxRBpdPHv6nYAngnH9DOB/3X2Cmd0NTHf38cBjwJNmtgTYAlzeaIlFEuC4jq0Y1KWAtdv3snb73i8s37RzP5OXbFLpS7MTz9k7c4AvfCrG3e+sMr0P+Hpio4k0no4FOfz1n4cfdfkvXl7Anz5c0XSBRJqIPpErUoOcaAb7yg8yaXEZ9oWr/UAkwxjcvTXZUZ35I6lFpS9Sg3YtY1/mftVj0466zk9HD+SaM0qaKJFIYqj0RWrwzdOLGdS1gMoaTvd0h79/5EO27SkPIZlIw6j0RWqQGclgcHGboy7PimSwr0IXeZPUo9IXqYfsaAYvf7KOpRt3HXO9aMT43jl96N+xVRMlEzk2lb5IPYw+qTMzPtvKyi17jrnewvU7GdCplUpfkoZKX6Qefv7VQbWu4+70/NHL
HKjQp34leWSEHUCkuTIzsqMZ7FfpSxLRnr5II8qORnhyymeMn7221nWvHlrCjWf3aoJUks5U+iKN6Nbz+zFn9bZa13tzwUY+WLpJpS+NTqUv0oiuHNId6F7ret945EON/UuT0Ji+SBLI0ti/NBHt6YskgexohKnrtjDyvnfiWr8wL5s/XXcaOZm69o/UjUpfJAl8a0gxOZnx/eG9Zttepq3Ywrrt++hRmNfIyaS5UemLJIER/dozol/7uNb925x13PTMTB0DkHrRmL5IismKxn5sVfpSH9rTF0kxh0r/8Q9W0LEgu06Pzc2K8p1hPWiRpWMB6UqlL5JiurfNpVVOlJdmranT4xyoPOgM7NyKs+McSpLmR6UvkmJKCvOYc9f5dX7cvLXb+fJvJ+vU0DSnMX2RNJEVif24l1eq9NOZSl8kTWRGdABYNLwjkjYOHQC+e8J87nvt03ptIxox/vNrJzC0V7tERpMmVGvpm1k34E9AB2LHgsa6+wPV1jkbeAlYHsx6wd3vTmhSEWmQTgU53Hh2Lzbu2F+vxx90Z9zHa5i9eptKP4XFs6dfAfyru880s3xghpm97u7zq603yd0vTnxEEUkEM+OHF/Sv9+MrD8ZKX8NDqa3WMX13X+fuM4PpncACoEtjBxOR5BLJMDJMB4JTXZ0O5JpZCXAyMLWGxUPNbLaZvWJmAxMRTkSSS2YkgwMq/ZQW94FcM2sJPA/c7O47qi2eCXR3911mdhHwItCnhm2MAcYAFBcX1zeziIQkK5LBM1NXMnHu+gZtJz8nypPfOZ02eVkJSibxiqv0zSyTWOE/7e4vVF9e9ZeAu79sZv9jZoXuvqnaemOBsQClpaXeoOQi0uRuHtWXT+L4JrBj2bBjPx8u28xnW/ao9EMQz9k7BjwGLHD3/zrKOh2BDe7uZnYasWGjzQlNKiKhu254jwZvY9LiMj5ctlnHBkISz57+MOAq4BMzmxXM+xFQDODuDwOXATeaWQWwF7jc3bUnLyJfkKlPBoeq1tJ398mA1bLOg8CDiQolIs3X56Wv/cIw6BO5ItKkMiOxfcg35m9g1ZY9Cdlmy+woF5/QiWhEV5apjUpfRJpU+/wcMiPGk1M+S+h2u7VtwSnd2yZ0m82RSl9EmlTHghxm/ngUe8srE7K9WSu3MebJGew9oGME8VDpi0iTy8/JJD8nMyHbKsyPfXtY+UGVfjw0ACYiKS0zI1ZjFTowHBeVvoiktMxo7MBwhU4BjYuGd0QkpUWDPf3X529g7fZ9jfY8x3XM54zehY22/aai0heRlFbYMovcrAgvfLyGFz6u25fF10X7/Gym3XFuo22/qaj0RSSltc7NYuaPRzXqF77fM3Ehf5uzrtG235RU+iKS8nIyI+RkRhpt+7mZkWZzzEAHckVEahGJGOUHm8fZQSp9EZFaZGZkUKnSFxFJD9GIUXnQaQ4XD9aYvohILaIZsc8CjPrNe8e+5HCcvnFqN64/s2cCtlR3Kn0RkVqcN7AjizbsojJBl3oobJmdkO3Uh0pfRKQWfTvk899XnBx2jITQmL6ISBpR6YuIpBGVvohIGlHpi4ikEZW+iEgaUemLiKQRlb6ISBpR6YuIpBEL61oSZlYGfFbPhxcCmxIYJ9GSOV8yZwPla4hkzgbJnS+Zs8GR+bq7e1F9NxRa6TeEmU1399KwcxxNMudL5mygfA2RzNkgufMlczZIbD4N74iIpBGVvohIGknV0h8bdoBaJHO+ZM4GytcQyZwNkjtfMmeDBOZLyTF9ERGpn1Td0xcRkXpIudI3swvMbJGZLTGz25rwef9gZhvNbG6VeW3N7HUzWxz82yaYb2b22yDjHDMbXOUx1wTrLzazaxKUrZuZvW1m881snpl9P1nymVmOmU0zs9lBtp8G83uY2dQgw1/MLCuYnx3cXxIsL6myrduD+YvM7PyGZquy3YiZfWxmE5Iw2woz+8TMZpnZ9GBe6K9rle22NrPnzGyhmS0ws6HJkM/M+gX/
Z4duO8zs5mTIVmW7/xL8TMw1s2eDn5XGf++5e8rcgAiwFOgJZAGzgQFN9NxnAYOBuVXm3QvcFkzfBtwTTF8EvAIYMASYGsxvCywL/m0TTLdJQLZOwOBgOh/4FBiQDPmC52gZTGcCU4Pn/F/g8mD+w8CNwfR3gYeD6cuBvwTTA4LXOxvoEbwPIgl6bW8BngEmBPeTKdsKoLDavNBf1ypZngCuD6azgNbJlC/YfgRYD3RPlmxAF2A50KLKe+7apnjvJeQ/taluwFDg1Sr3bwdub8LnL+HI0l8EdAqmOwGLgulHgCuqrwdcATxSZf4R6yUw50vAqGTLB+QCM4HTiX3QJFr9dQVeBYYG09FgPav+Wlddr4GZugJvAiOBCcFzJUW2YFsr+GLpJ8XrChQQKy5LxnxVtnce8H4yZSNW+quI/TKJBu+985vivZdqwzuH/qMOWR3MC0sHd18XTK8HOgTTR8vZ6PmDP/tOJrZHnRT5guGTWcBG4HVieyPb3L2ihuc5nCFYvh1o11jZgPuBHwCHvvy0XRJlA3DgNTObYWZjgnlJ8boS27MsA/4YDI89amZ5SZTvkMuBZ4PppMjm7muAXwMrgXXE3kszaIL3XqqVftLy2K/ZUE+FMrOWwPPAze6+o+qyMPO5e6W7n0Rsr/o0oH8YOaozs4uBje4+I+wsxzDc3QcDFwI3mdlZVReG/L6LEhvy/J27nwzsJjZkcljYPxfBmPho4P+qLwszW3As4RJivzg7A3nABU3x3KlW+muAblXudw3mhWWDmXUCCP7dGMw/Ws5Gy29mmcQK/2l3fyHZ8gG4+zbgbWJ/trY2s2gNz3M4Q7C8ANjcSNmGAaPNbAXwZ2JDPA8kSTbg8B4h7r4RGEfsl2ayvK6rgdXuPjW4/xyxXwLJkg9ivyxnuvuG4H6yZDsXWO7uZe5eDrxA7P3Y6O+9VCv9j4A+wRHuLGJ/to0PMc944NDR/GuIjaUfmn91cEbAEGB78Cflq8B5ZtYm+E1/XjCvQczMgMeABe7+X8mUz8yKzKx1MN2C2LGGBcTK/7KjZDuU+TLgrWCPbDxweXAWQw+gDzCtIdnc/XZ37+ruJcTeS2+5+7eSIRuAmeWZWf6haWKvx1yS4HUFcPf1wCoz6xfMOgeYnyz5Alfw+dDOoQzJkG0lMMTMcoOf30P/d43/3kvUwZKmuhE7yv4psXHhO5rweZ8lNvZWTmwP5zpiY2pvAouBN4C2wboGPBRk/AQorbKd7wBLgtu3E5RtOLE/U+cAs4LbRcmQDzgB+DjINhe4M5jfM3hzLiH2p3d2MD8nuL8kWN6zyrbuCDIvAi5M8Ot7Np+fvZMU2YIcs4PbvEPv92R4Xats9yRgevD6vkjsDJekyEdsyGQzUFBlXlJkC7b7U2Bh8HPxJLEzcBr9vadP5IqIpJFUG94REZEGUOmLiKQRlb6ISBpR6YuIpBGVvohIGlHpi4ikEZW+iEgaUemLiKSR/wfcU0tkrmcRsgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Natural-log frequency against frequency rank for ranks 300-7999;\n",
    "# the 300 most frequent tokens are excluded from the curve.\n",
    "RANK_START, RANK_STOP = 300, 8000\n",
    "mid_counts = sorted(words_count.values(), reverse=True)[RANK_START:RANK_STOP]\n",
    "# Label the x-axis with the true rank; the slice may be shorter than\n",
    "# RANK_STOP - RANK_START when the vocabulary is small.\n",
    "mid_ranks = np.arange(RANK_START, RANK_START + len(mid_counts))\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(mid_ranks, np.log(mid_counts))  # vectorised log instead of map/lambda\n",
    "ax.set(xlabel='frequency rank', ylabel='ln(count)', title='Token log-frequency by rank')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "53a2813d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\ldq\\AppData\\Local\\Temp\\ipykernel_17060\\2281528340.py:1: RuntimeWarning: divide by zero encountered in log\n",
      "  plt.plot(list(map(lambda n: np.log(np.log(n)), sorted(words_count.values(), reverse=True))))\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x2227ccaa800>]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD4CAYAAADvsV2wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAZ70lEQVR4nO3deZhU9Z3v8fe3qlf2pdlkEXBDcSUdFB2jooxgJuITvXM1N4lm4hBHnUkyyU1MzKhJZp4xTmK8LlcHE5KYa6IJM7kheZioCF4IBrQhrBqgQZQmKHuzNtDd3/tHnS6qm+5m6VN1qk59Xs9TT58653T9vlUUH378zjm/Y+6OiIjEXyLqAkREJDcU+CIiRUKBLyJSJBT4IiJFQoEvIlIkSqIuoCNVVVU+cuTIqMsQESkoS5Ys2e7uA9rblreBP3LkSGpqaqIuQ0SkoJjZux1t05COiEiRUOCLiBQJBb6ISJFQ4IuIFAkFvohIkVDgi4gUCQW+iEiRiF3g7z/UyKMvr2HZpt1RlyIikldiF/gNR5p4fG4tK+p2R12KiEheiV3gJ8wAaG7WjV1ERDLFN/CV9yIircQu8C14R826daOISCuxC/yWHr7yXkSktRgGfupnkxJfRKSVGAZ+yxi+Al9EJFNsA195LyLSWgwDP/VTp2WKiLQWw8DXaZkiIu2JXeBbSw9fYzoiIq3EMPANM3AFvohIK7ELfEgN6+i0TBGR1mIa+BrDFxFpK6aBbxrDFxFpI7aBr7wXEWktpoGv8/BFRNqKaeCbxvBFRNqIZeCb6Tx8EZG2Yhn4iYQO2oqItBXLwE/qLB0RkWPEMvBNY/giIsfocuCb2XAzm2dmb5nZajP7fDv7mJk9bma1ZrbCzMZ1td3OJBPQ2NSczSZERApOSQiv0Qh8yd2XmllPYImZveLub2XsMwU4K3hcCjwd/MyKytIkDUcU+CIimbrcw3f3Le6+NFjeC7wNDG2z21TgOU9ZBPQxsyFdbbsjFaVJDh5pytbLi4gUpFDH8M1sJHAJsLjNpqHApozndRz7j0JoKsuSNCjwRURaCS3wzawH8B/AF9x9zym+xjQzqzGzmm3btp1yLZWlSQ4eVuCLiGQKJfDNrJRU2D/v7v/Zzi6bgeEZz4cF61px9+nuXu3u1QMGDDjleio1pCMicowwztIx4IfA2+7+aAe7zQI+HZytcxlQ7+5butp2RyrKFPgiIm2FcZbOFcCngJVmtixY93VgBIC7PwPMBm4AaoEDwGdCaLdDlaVJGjSkIyLSSpcD391/D9hx9nHgnq62daK6lSXZd6gxV82JiBSEWF5p2797OXsaGjnUqF6+iEiLWAb+wF7lAOzYdzjiSkRE8kcsA79/9zJAgS8ikimWgd+7shSA+oNHIq5ERCR/xDLw+wY9/J0H1MMXEWkRy8Af0rsCgPfrD0ZciYhI/ohl4PcoL6G8JKExfBGRDLEMfDOjqkc52/YdiroUEZG8EcvAh9SwzuZdGtIREWkR28Af0a8b7+08EHUZIiJ5I7aBf1qfSj7Y00CTbm4rIgLEOPAH9a6g2WG7xvFFRIAYB/6wPpUAbNKwjogIEOPAP2dwTwDe3nJKN98SEYmd2Ab+kN4V9OtexsrN9VGXIiKSF2Ib+GbGhcN6s+TdXVGXIiKSF2Ib+ADjR/Vj/bb97NacOiIi8Q78i4f1AWB5nYZ1RERiHfjnD+uNGfzxPQ3riIjEOvB7VZQyZnAvFtZuj7oUEZHIxTrwAaacP5g3N+5i3Qd7oy5FRCRSsQ/8T1w6gpKE8cKbm6IuRUQkUrEP/Koe5Vw/djAzl9TR2NQcdTkiIpGJfeAD3HDBEOoPHuHNjTp4KyLFqygC/5oxA0gmjP/9Wm3UpYiIRKYoAr9bWQkj+nVjwbrtHDzcFHU5IiKRKIrAB7h9wukA/Pv89RFXIiISjaIJ/E9NGAnAY3PWcUQHb0WkCIUS+GY2w8y2mtmqDrZfbWb1
ZrYseDwQRrsnI5kwvvyXZwPw3ZfW5Lp5EZHIhdXD/zEw+Tj7LHD3i4PHt0Jq96TcffWZAPz7/A0cONwYRQkiIpEJJfDdfT6wM4zXyqZEwvjGR88F4J7nl0ZcjYhIbuVyDH+CmS03s/8ys7Ht7WBm08ysxsxqtm3blpUi7rxyNL0qSpi3Zhsbt+/PShsiIvkoV4G/FDjd3S8CngD+b3s7uft0d6929+oBAwZkrZjpn64G4JZn/kBzs2etHRGRfJKTwHf3Pe6+L1ieDZSaWVUu2m7PZaP7c/O4YWzfd4i/e35JVGWIiORUTgLfzAabmQXL44N2d+Si7Y48fPMFlCaNl1Z/wO9WbYmyFBGRnAjrtMyfA38AzjGzOjP7rJndZWZ3BbvcAqwys+XA48Ct7h7pWEppMsGcf7wKgLv+z1ION+rcfBGJN4s4dztUXV3tNTU1WW/noVmr+fHrG+nXvYya+68jkbCstykiki1mtsTdq9vbVjRX2nbkwY+dx2m9K9i5/zBTn1oYdTkiIllT9IFvZiz46kRKk8bKzfU8+rKuwhWReCr6wIfUtAtL/mkSAI/PreXtLXsirkhEJHwK/ECvilKe/MQlAEz5Xwv48+6DEVckIhIuBX6Gv7rwND5zxUgALn94rmbVFJFYUeC38eDHxnLlWalrwj707Vd0H1wRiQ0Ffjue+5vxDOpVzp6GRq58ZB75euqqiMjJUOC3w8z4w33X0rO8hC31DVz+8FyaNOeOiBQ4BX4HEglj8f3X0rMiFfoXffNl3Q9XRAqaAr8T3cpKWPpPkxjap5J9hxo5/6GX2HdIN04RkcKkwD+O0mSC33/1Gs4a2IOmZqf6n19RT19ECpIC/wSYGS9/8SOMrupOw5FmPvwvc9ivnr6IFBgF/glqCf3T+3dj36FGxj74Ept2Hoi6LBGRE6bAPwklyQSv/uNVVJ/eF4ArH5nHn97XNAwiUhgU+CepJJngl3dN4O6rzwBg8mML+PWyzbpVoojkPQX+KTAzvjJ5DPdNGQPA519Yxt3P6yYqIpLfFPhdcNdVZzDzrgkA/G71+9zyzOs0HNEZPCKSnxT4XVQ9sh/LHphEwmBFXT0Tv/saW+o106aI5B8Ffgj6dCtj1TevZ8zgnvy5voEJ/zqX2St1Y3QRyS8K/JB0Kyth9j9cyV9XDwPg7ueX8uz8DRFXJSJylAI/RImE8cgtF/Hwxy9gYM9yHpuzljt+9AabdTMVEckDCvwsuHX8CL590/mcPbgnr63Zxhde+CM/WvhO1GWJSJEribqAuLp+7GAmjhnIJ3+wmLe37GFFXT0NR5q59tyBnD2oZ9TliUgRUg8/i0qTCV783AS+NfV8DjU2853f/YmHZq1mRd1unbMvIjmnwM+Bmy4Zypp/nszEMQN5ff0ObnxyIU/OXRd1WSJSZBT4OVJekuThmy9gxh3VDOxZzoyFG5n06P9jZV191KWJSJFQ4OfQwJ4VTBwziC9ffw5XnT2AdVv3MX3BBn62+D1W/1nBLyLZFUrgm9kMM9tqZqs62G5m9riZ1ZrZCjMbF0a7heqvq4fzxG2XUNWjnN8s/zNf/9VKvjJzRdRliUjMhdXD/zEwuZPtU4Czgsc04OmQ2i1YiYSx4CvXsPjr13LjRaex9oO9XP/9+Ux9aqHm2ReRrAgl8N19PrCzk12mAs95yiKgj5kNCaPtQlZZlmRQrwo+ednpXHfuIAb2Kmf5pt3MX7eNLfUHdf9cEQlVrs7DHwpsynheF6xrNeGMmU0j9T8ARowYkaPSojd+VD/Gj+rH1r0NjP+XV7n/V6mRsZ7lJbz5jeuoKE1GXKGIxEFeXXjl7tOB6QDV1dVFd0eRgT0rmHFHNVv3HKLm3V3MXFLH/LXb6N+jjIQZ5w/tTWlSx9lF5NTkKvA3A8Mzng8L1kkbE8cMAmBgr3JmLqlj2k+XpLd946PncueVo6MqTUQKXK4CfxZwr5m9AFwK1Lu75g/uxFVnD+QXn5uQvqHK3z5X
w7oP9rF+2z4AShMJhverxMyiLFNECkgogW9mPweuBqrMrA54ECgFcPdngNnADUAtcAD4TBjtxlkyYYwf1S/9vKpHOS/WbOLFmqOHQp647RI+dtFpUZQnIgUolMB399uOs92Be8Joq1g9++lq1m3dC8Dhxmb+58wVurOWiJwUS2Vx/qmurvaampqoy8hLzc3O6K/Ppm+3Uvp2L0uv/8T4ERrjFylyZrbE3avb26ZTPgpQImF8adLZXHFmFecN6cV5Q3pRf+AIc/+0NerSRCSP5dVpmXLi/v7as1o9/9QPF/N+fQOLNuxIrzPgwmF9qCzTefwiosCPjQE9ylmwbju3Tl/Uav3dV5/BVyaPiagqEcknCvyYePDGsdwS3EC9xb0/+yM79x+OqCIRyTcK/JjoXVnK5WdUtVrXq6KEjTv2M/dPH7Rab2Z8eGQ/epTrj1+kmOhvfIxV9Shn0YadLNpw7Lx2fz/xTL70l+dEUJWIREWBH2M/uL2ad3ccO9Xyp364mN0HjkRQkYhESYEfY326ldGnW9kx67uXl3CosSmCikQkSgr8IlRZmuQ3y7ewsHbHMdsuHNabpz/5oQiqEpFsU+AXoXsnnsnr648N+5V19bp4SyTGFPhF6OPjhvHxccOOWf/oy2t4Yl4t7q5ZOEViSFMrSFpZSQJ3ONKUn/MriUjXqIcvaeUlqSkYrvnuayQ66AoM6FHOz/72Mt12UaQAKfAlbdJ5g1i3dS+NHfTw3915gCXv7mL7vkMM69stx9WJSFcp8CVtZFV3Hrnlog63/3rZZpa8u4vDjc05rEpEwqIxfDlhLTdQ1xi/SGFSD19OWEvg7zvUmL7XbkfMjh4TEJH8oMCXE1YZHKi9+enXT2j/x/77xdx0ydBsliQiJ0GBLyfsw6P68s0bx7L/cONx933kd2vYsH1/DqoSkROlwJcTVl6S5PbLR57Qvt97eS2NTTq4K5JPdNBWsqIkYTQ26+CuSD5R4EtWlCYTHFEPXySvaEhHsqIkaRxubD7p0DegJKl+iEg2KPAlKypKkjy/+D2eX/zeSf2eGTx52zg+euGQLFUmUrwU+JIV/3rzBazeXH9Sv9PUDN+fs5Z3tu/LUlUixU2BL1lxzTkDueacgSf1O03NzvfnrEVD/yLZocFSyRuJYAr+pmYlvkg2hBL4ZjbZzNaYWa2Z3dfO9jvMbJuZLQsed4bRrsSLmel0TpEs6vKQjpklgaeASUAd8KaZzXL3t9rs+qK739vV9iTekgmjyRX4ItkQRg9/PFDr7hvc/TDwAjA1hNeVIlSSMJo0G6dIVoRx0HYosCnjeR1waTv73WxmHwHWAl90901tdzCzacA0gBEjRoRQmhSaRMJ4f08DK+p2h/J6vStLOb1/91BeS6TQ5eosnd8AP3f3Q2b2OeAnwMS2O7n7dGA6QHV1tbp5RahneQm/XbGF367YEtpr/v6r1+gOXSKEE/ibgeEZz4cF69LcfUfG0x8Aj4TQrsTQT++8lI0hzbK59L1dPDVvPXsONkLfUF5SpKCFEfhvAmeZ2ShSQX8r8InMHcxsiLu3dNluBN4OoV2JoTMG9OCMAT1Cea2m4GyfZh0EFgFCCHx3bzSze4GXgCQww91Xm9m3gBp3nwX8g5ndCDQCO4E7utquyPEkgxP7m3SapwgQ0hi+u88GZrdZ90DG8teAr4XRlsiJSgSBrx6+SIqutJXYSpgCXySTAl9iK2ktQzoRFyKSJxT4EluJ4NutHr5IigJfYis9pKODtiKAAl9iLH2Wjnr4IoDmw5cYa+nhL96wk70NjTlrt3/3Mi4d3T9n7YmcKAW+xFa/7mUAPDmvNudtv3n/dQzoWZ7zdkU6o8CX2BpV1Z2F901kXw579y+vfp/vvbKWhiNNOWtT5EQp8CXWhvapzGl7q4L7+OqwgeQjHbQVCZFOBZV8psAXCZGhq3slfynwRUIUnBiE4l7ykQJfJEQtp4K6eviShxT4IiE6OmFbxIWItEOBLxKi
4OJejeFLXlLgi4QoPYavvJc8pMAXCZFpDn7JYwp8kRAdPWgbcSEi7VDgi4RIY/iSzxT4IiHSGL7kMwW+SIg0hi/5TIEvEiKdhy/5TLNlioSoZQx/ztsfULt1b7TFnKLh/bpx+RlVUZchWaDAFwlRVY9yzODp19ZHXcopK00aa749hUTLv14SGwp8kRCdO6QXS78xiYMFegOUHy18h2cXvKPJ32JKgS8Ssr7dy+gbdRGnqGdFKdAy+Zt6+HGjg7YiktYS8erhx1MogW9mk81sjZnVmtl97WwvN7MXg+2LzWxkGO2KSLh0HUG8dTnwzSwJPAVMAc4DbjOz89rs9llgl7ufCXwf+E5X2xWR8LVcRyDxFEYPfzxQ6+4b3P0w8AIwtc0+U4GfBMszgWtN3yyRvOUa1ImlMAJ/KLAp43ldsK7dfdy9EagH+rd9ITObZmY1Zlazbdu2EEoTkZOhIZ14y6uDtu4+3d2r3b16wIABUZcjUnRMZ+bEWhiBvxkYnvF8WLCu3X3MrAToDewIoW0RyQL18OMpjMB/EzjLzEaZWRlwKzCrzT6zgNuD5VuAua67PIvknfSQjsbwY6nLF165e6OZ3Qu8BCSBGe6+2sy+BdS4+yzgh8BPzawW2EnqHwURyTPp8/CV97EUypW27j4bmN1m3QMZyw3AfwujLRHJnqM9fImjvDpoKyLRajloqxHXeFLgi0iaro6JNwW+iBxD/ft4UuCLSFrLBfAa0YknBb6IpKVHdBT4saTAF5Fj6Dz8eFLgi0ia5tKJNwW+iKTpBijxpsAXkbSjB20V+XGkwBeRNF1pG28KfBFJ03VX8abAF5GjdB5+rCnwRSTt6EFbJX4cKfBFJM10mk6sKfBF5BjK+3hS4ItI2tHpkSMuRLJCgS8iabrFYbwp8EUkTbc4jDcFvoik6QYo8abAF5G09Bh+xHVIdoRyE3MRiYmgh/8/nl1EaVL9waiMGdKLJ267JPTXVeCLSNoVZ1Zx08WncbipOepSitrwvpVZeV0FvoikDe1TyWO3ht+zlPyg/7OJiBQJBb6ISJFQ4IuIFAkFvohIkehS4JtZPzN7xczWBT/7drBfk5ktCx6zutKmiIicmq728O8DXnX3s4BXg+ftOejuFwePG7vYpoiInIKuBv5U4CfB8k+Am7r4eiIikiVdDfxB7r4lWH4fGNTBfhVmVmNmi8zspi62KSIip+C4F16Z2RxgcDub7s984u5uZh1NwXG6u282s9HAXDNb6e7r22lrGjAteLrPzNYcr75OVAHbu/D7USvk+gu5dlD9UVP9XXN6RxvMuzAPahDIV7v7FjMbArzm7ucc53d+DPzW3WeecsMnVluNu1dns41sKuT6C7l2UP1RU/3Z09UhnVnA7cHy7cCv2+5gZn3NrDxYrgKuAN7qYrsiInKSuhr4DwOTzGwdcF3wHDOrNrMfBPucC9SY2XJgHvCwuyvwRURyrEuTp7n7DuDadtbXAHcGy68DF3SlnVM0PYI2w1TI9Rdy7aD6o6b6s6RLY/giIlI4NLWCiEiRUOCLiBSJ2AW+mU02szVmVmtmHU31EAkz22hmK4M5hWqCde3OR2QpjwfvY4WZjct4nduD/deZ2e0dtRdCvTPMbKuZrcpYF1q9Zvah4POoDX431Ftod1D/Q2a2OWNupxsytn0tqGWNmV2fsb7d75SZjTKzxcH6F82sLMTah5vZPDN7y8xWm9nng/UF8fl3Un+hfP4VZvaGmS0P6v9mZ22aWXnwvDbYPvJU31dWuXtsHkASWA+MBsqA5cB5UdeVUd9GoKrNukeA+4Ll+4DvBMs3AP9F6i6jlwGLg/X9gA3Bz77Bct8s1fsRYBywKhv1Am8E+1rwu1NyUP9DwJfb2fe84PtSDowKvkfJzr5TwC+AW4PlZ4C/C7H2IcC4YLknsDaosSA+/07qL5TP34AewXIpsDj4rNptE7gbeCZYvhV48VTfVzYfcevhjwdq3X2Dux8GXiA1308+62g+
oqnAc56yCOhjqYvbrgdecfed7r4LeAWYnI3C3H0+sDMb9Qbbern7Ik/9zXiOkOdi6qD+jkwFXnD3Q+7+DlBL6vvU7ncq6A1PBFouIAx1Lil33+LuS4PlvcDbwFAK5PPvpP6O5Nvn7+6+L3haGjy8kzYz/1xmAtcGNZ7U+wqr/o7ELfCHApsyntfR+Zcs1xx42cyWWGoaCeh4PqKO3kvU7zGseocGy23X58K9wbDHDDs6pffJ1t8f2O3ujW3Why4YHriEVC+z4D7/NvVDgXz+ZpY0s2XAVlL/UK7vpM10ncH2+qDGvPp7HLfAz3d/4e7jgCnAPWb2kcyNQU+rYM6TLbR6A08DZwAXA1uA70VazXGYWQ/gP4AvuPuezG2F8Pm3U3/BfP7u3uTuFwPDSPXIx0RbUdfFLfA3A8Mzng8L1uUFd98c/NwK/IrUl+iD4L/XBD+3Brt39F6ifo9h1bs5WG67Pqvc/YPgL3Iz8CypPwOOU2d763eQGjYpabM+NGZWSiosn3f3/wxWF8zn3179hfT5t3D33aRmCZjQSZvpOoPtvYMa8+vvcbYPEuTyQerK4Q2kDo60HAgZG3VdQW3dgZ4Zy6+TGnv/N1ofhHskWP4orQ/CvRGs7we8Q+oAXN9guV8W6x5J64OeodXLsQcNb8hB/UMylr9IanwVYCytD65tIHVgrcPvFPBLWh/AuzvEuo3UuPpjbdYXxOffSf2F8vkPAPoEy5XAAuCvOmoTuIfWB21/carvK5uPrL54FA9SZyusJTXedn/U9WTUNTr4Q10OrG6pjdQ436vAOmBOxl9GA54K3sdKoDrjtf6G1MGfWuAzWaz556T+232E1BjjZ8OsF6gGVgW/8yTBld9Zrv+nQX0rSE3+lxlA9we1rCHjjJWOvlPBn+kbwfv6JVAeYu1/QWq4ZgWwLHjcUCiffyf1F8rnfyHwx6DOVcADnbUJVATPa4Pto0/1fWXzoakVRESKRNzG8EVEpAMKfBGRIqHAFxEpEgp8EZEiocAXESkSCnwRkSKhwBcRKRL/H9H3Dl83ydNfAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Rank-frequency curve: the values of words_count (presumably per-token counts),\n",
    "# sorted in descending order, each passed through log(log(.)).\n",
    "# NOTE(review): a value of 1 gives log(log(1)) = log(0) = -inf -- confirm this is intended.\n",
    "plt.plot(\n",
    "    [\n",
    "        np.log(np.log(count))\n",
    "        for count in sorted(words_count.values(), reverse=True)\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "c4e53975",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take the 300 entries of words_count with the largest values as the stopword list.\n",
    "stopwords = [\n",
    "    token\n",
    "    for token, _count in sorted(\n",
    "        words_count.items(), key=lambda entry: entry[1], reverse=True\n",
    "    )[:300]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "06589872",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the stopword list, one entry per line.\n",
    "# The corpus is Chinese, so pin UTF-8 instead of relying on the platform\n",
    "# default encoding, which varies by OS/locale and can either fail on these\n",
    "# characters or produce a file that UTF-8 readers cannot decode.\n",
    "with open('datasets/stopwords.txt', 'w', encoding='utf-8') as f:\n",
    "    for w in stopwords:\n",
    "        f.write(w + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5470f267",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73c024ef",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "3f462b4c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "1779003d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>label</th>\n",
       "      <th>label_desc</th>\n",
       "      <th>sentence</th>\n",
       "      <th>sentence_len</th>\n",
       "      <th>words</th>\n",
       "      <th>words_len</th>\n",
       "      <th>words_keep</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>108</td>\n",
       "      <td>news_edu</td>\n",
       "      <td>上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？</td>\n",
       "      <td>44</td>\n",
       "      <td>上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...</td>\n",
       "      <td>26</td>\n",
       "      <td>上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>104</td>\n",
       "      <td>news_finance</td>\n",
       "      <td>商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告</td>\n",
       "      <td>46</td>\n",
       "      <td>商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...</td>\n",
       "      <td>20</td>\n",
       "      <td>商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>106</td>\n",
       "      <td>news_house</td>\n",
       "      <td>通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？</td>\n",
       "      <td>32</td>\n",
       "      <td>通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...</td>\n",
       "      <td>21</td>\n",
       "      <td>通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>112</td>\n",
       "      <td>news_travel</td>\n",
       "      <td>2018年去俄罗斯看世界杯得花多少钱？</td>\n",
       "      <td>19</td>\n",
       "      <td>2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？</td>\n",
       "      <td>10</td>\n",
       "      <td>2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>109</td>\n",
       "      <td>news_tech</td>\n",
       "      <td>剃须刀的个性革新，雷明登天猫定制版新品首发</td>\n",
       "      <td>21</td>\n",
       "      <td>剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发</td>\n",
       "      <td>11</td>\n",
       "      <td>剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  label    label_desc                                        sentence  \\\n",
       "0   0    108      news_edu    上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？   \n",
       "1   1    104  news_finance  商赢环球股份有限公司关于延期回复上海证券交易所对公司2017年年度报告的事后审核问询函的公告   \n",
       "2   2    106    news_house                通过中介公司买了二手房，首付都付了，现在卖家不想卖了。怎么处理？   \n",
       "3   3    112   news_travel                             2018年去俄罗斯看世界杯得花多少钱？   \n",
       "4   4    109     news_tech                           剃须刀的个性革新，雷明登天猫定制版新品首发   \n",
       "\n",
       "   sentence_len                                              words  words_len  \\\n",
       "0            44  上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...         26   \n",
       "1            46  商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...         20   \n",
       "2            32  通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...         21   \n",
       "3            19                       2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？         10   \n",
       "4            21                    剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发         11   \n",
       "\n",
       "                                          words_keep  \n",
       "0  上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让...  \n",
       "1  商赢 环球 股份 有限公司 关于 延期 回复 上海证券交易所 对 公司 2017 年 年度报...  \n",
       "2  通过 中介 公司 买 了 二手房 ， 首付 都 付 了 ， 现在 卖家 不想 卖 了 。 怎...  \n",
       "3                       2018 年 去 俄罗斯 看 世界杯 得花 多少 钱 ？  \n",
       "4                    剃须刀 的 个性 革新 ， 雷明登 天猫 定制 版 新品 首发  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the training frame.\n",
    "train_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "1181acf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the training frame to CSV; index=False leaves the row index out of the file.\n",
    "train_df.to_csv(\n",
    "    'datasets/train_after_analysis.csv',\n",
    "    index=False,\n",
    ")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "trader",
   "language": "python",
   "name": "trader"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
